###############################################################################-
# Created By: Pietryka
# Creation Date:  2017-03-15
# Updated Date: 2019-04-30
# Purpose: calculate similarity scores for pairs of documents
# Contact: mpietryka@fsu.edu
###############################################################################-


# 1. LOAD DATA AND PACKAGES   ----------------

## 1A. LOAD PACKAGES  ------------

library(tidyverse)   # DATA CLEANING FUNCTIONS
library(stringr)     # STRING FUNCTIONS



## 1B. LOAD DATA ----------------

# 'dyads_df' OBJECT DATA CREATED IN 'SC-1- Dyadic Data.R'
# get_similarity() BELOW READS THIS OBJECT AND USES ITS COLUMNS
# to, from, date_to AND date_from
dyads_df <- read_rds("../Data/Derived/dyads_df.rds")
# PRINT THE OBJECT'S "source" ATTRIBUTE (NULL IF NONE IS SET);
# PRESUMABLY A PROVENANCE LABEL WRITTEN BY THE CREATING SCRIPT -- TODO CONFIRM
attr(dyads_df, "source")

# 'all_*grams' DF OBJECTS CREATED IN 'SC-3- Extract Innovative Text.R'
# nest_data() BELOW EXPECTS EACH N-GRAM OBJECT TO HAVE THE COLUMNS
# document_id AND ngram
all_threegrams <- read_rds("../Data/Derived/all_threegrams.rds")
all_fivegrams <- read_rds("../Data/Derived/all_fivegrams.rds")
all_sevengrams <- read_rds("../Data/Derived/all_sevengrams.rds")
all_fivegrams_nostop <- read_rds("../Data/Derived/all_fivegrams_nostop.rds")


# 'original_*grams' DF OBJECTS CREATED IN 'SC-3- Extract Innovative Text.R'
# USED BELOW AS THE "NEW" (INNOVATIVE) SOURCE-SIDE TEXT (from_new*)
original_threegrams <- read_rds("../Data/Derived/original_threegrams.rds")
original_fivegrams <- read_rds("../Data/Derived/original_fivegrams.rds")
original_sevengrams <- read_rds("../Data/Derived/original_sevengrams.rds")
original_fivegrams_nostop <- read_rds("../Data/Derived/original_fivegrams_nostop.rds")

# 'borrowed_*grams' DF OBJECTS CREATED IN 'SC-3- Extract Innovative Text.R'
# USED BELOW AS THE "BORROWED" FOCAL-SIDE TEXT (to_borrowed*)
borrowed_threegrams <- read_rds("../Data/Derived/borrowed_threegrams.rds")
borrowed_fivegrams <- read_rds("../Data/Derived/borrowed_fivegrams.rds")
borrowed_sevengrams <- read_rds("../Data/Derived/borrowed_sevengrams.rds")
borrowed_fivegrams_nostop <- read_rds("../Data/Derived/borrowed_fivegrams_nostop.rds")



## 1C. DEFINE FUNCTIONS ----------------


# FUNCTION TO CALCULATE RATIO
#
# Share of the n-grams in `to_ngram` that also occur in `from_ngram`.
#
# Args:
#   from_ngram: object whose `ngram` element holds the source n-grams.
#   to_ngram:   object whose `ngram` element holds the focal n-grams.
#
# Returns: a single number. Repeated n-grams in `to_ngram` are each counted.
#   The result is NaN (0 / 0) when `to_ngram` holds no n-grams.
get_ratio <- function(from_ngram, to_ngram){
  source_terms <- from_ngram$ngram
  target_terms <- to_ngram$ngram

  # %in% never yields NA, so summing the logical vector counts the matches
  n_shared <- sum(target_terms %in% source_terms)

  n_shared / length(target_terms)
}

# FUNCTION TO CALCULATE JACCARD SIMILARITY
#
# Number of distinct n-grams found in both inputs divided by the number of
# distinct n-grams found in either input.
#
# Args:
#   from_ngram: object whose `ngram` element holds the source n-grams.
#   to_ngram:   object whose `ngram` element holds the focal n-grams.
#
# Returns: a single number; NaN (0 / 0) when neither input holds any n-grams.
get_jaccard <- function(from_ngram, to_ngram){
  source_terms <- from_ngram$ngram
  target_terms <- to_ngram$ngram

  shared_terms <- intersect(source_terms, target_terms)
  pooled_terms <- union(source_terms, target_terms)

  length(shared_terms) / length(pooled_terms)
}


# FUNCTION TO SUBSET DATA, CREATE NESTED DF
#
# Keeps only `document_id` and `ngram` from `df`, renames `document_id` to
# `to` or `from`, groups by that id and nests `ngram` into a list-column.
#
# Args:
#   df: data frame with the columns `document_id` and `ngram`.
#   to: when `to == TRUE` the id column is named `to` and the list-column
#       `to_ngram`; otherwise they are named `from` and `from_ngram`.
#
# Returns: the nested data frame.
nest_data <- function(df, to = TRUE){
  # FOCAL ("to") SIDE
  if (to == TRUE) {
    focal_side <- select(df, to = document_id, ngram)
    focal_side <- group_by(focal_side, to)
    return(nest(focal_side, to_ngram = ngram))
  }

  # SOURCE ("from") SIDE
  source_side <- select(df, from = document_id, ngram)
  source_side <- group_by(source_side, from)
  nest(source_side, from_ngram = ngram)
}

# FUNCTION TO CREATE DF WITH SIMILARITY SCORES FOR EACH DYAD
#
# Args:
#   from_df: nested source-side data with the columns `from` and `from_ngram`
#            (the shape returned by nest_data(..., to = FALSE)).
#   to_df:   nested focal-side data with the columns `to` and `to_ngram`
#            (the shape returned by nest_data(..., to = TRUE)).
#   dyads:   dyad-level data with the columns `to`, `from`, `date_to` and
#            `date_from`. Defaults to the global `dyads_df`, which is the
#            object this function read before the argument existed.
#
# Returns: one row per retained dyad with the columns `from`, `to`, `ratio`,
#   `ratio_sum_from`, `ratio_sum_to`, `jaccard`, `jaccard_sum_from` and
#   `jaccard_sum_to`.
#
# NOTE(review): a dyad with no matching row in `to_df` gets a NULL `to_ngram`
# from the left join, so get_ratio() returns NaN (0 / 0) for it; cumsum() and
# sum() below then carry that NaN into other dyads of the same group.
# Confirm this is intended.
get_similarity <- function(from_df, to_df, dyads = dyads_df){
  dyads  %>%
    # KEEP ONE ROW PER (to, from) PAIR
    distinct(to, from, .keep_all = TRUE)  %>%
    select(to, from, date_to, date_from)  %>%
    # KEEP ONLY DYADS WHOSE FOCAL DOC IS DATED AFTER THE SOURCE DOC
    filter(date_to > date_from)  %>%
    # JOIN KEYS ARE SPELLED OUT SO THE JOINS NEITHER PRINT A KEY-GUESSING
    # MESSAGE NOR PICK UP AN UNINTENDED SHARED COLUMN
    left_join(to_df, by = "to")  %>%
    left_join(from_df, by = "from")  %>%
    mutate(
      ratio = map2_dbl(from_ngram, to_ngram, get_ratio),
      jaccard = map2_dbl(from_ngram, to_ngram, get_jaccard)
    ) %>%
    # SOURCE DOC'S RUNNING TOTAL OF SIMILARITY ACROSS ITS FOCAL DOCS, WITH
    # ROWS ORDERED BY date_to THEN date_from, MINUS SIMILARITY IN THIS DYAD
    arrange(date_to, date_from)  %>%
    group_by(from)  %>%
    mutate(
      ratio_sum_from = cumsum(ratio) - ratio,
      jaccard_sum_from = cumsum(jaccard) - jaccard
    )  %>%
    # FOCAL DOC'S SUM OF SIMILARITY ACROSS ALL SOURCES MINUS SIMILARITY IN DYAD
    group_by(to)  %>%
    mutate(
      ratio_sum_to = sum(ratio) - ratio,
      jaccard_sum_to = sum(jaccard) - jaccard
    )  %>%
    ungroup() %>%
    select(from, to, starts_with("ratio"), starts_with("jaccard"))
}


# Function to save data and add attribute with label
#
# Looks up the object called `the_name` with get(), sets its "source"
# attribute to a label naming this script, and writes it to
# "../Data/Derived/<the_name>.rds".
#
# Args:
#   the_name: a single character string, the name of the object to save.
#             It is also used as the file name (without extension).
#
# Returns: the value of write_rds().
save_rds <- function(the_name){

  # get() and the output path both need exactly one name
  stopifnot(is.character(the_name), length(the_name) == 1L)

  the_source <- "Created in 'DataClean/SC-4- Measure Similarities.R'"

  the_object <- get(the_name)

  attr(the_object, "source") <- the_source

  the_path <- paste0("../Data/Derived/", the_name, ".rds")

  # The destination is passed by position: readr 1.4.0 renamed this argument
  # from `path` to `file`, and `path =` now raises a deprecation warning.
  write_rds(the_object, the_path)
}


# 2. SUBSET DATA ----------------


# THREE-GRAMS: SOURCE SIDE (from_*) THEN FOCAL SIDE (to_*)
from_new3 <- nest_data(df = original_threegrams, to = FALSE)
from_full3 <- nest_data(df = all_threegrams, to = FALSE)
to_full3 <- nest_data(df = all_threegrams, to = TRUE)
to_borrowed3 <- nest_data(df = borrowed_threegrams, to = TRUE)

# FIVE-GRAMS
from_new <- nest_data(df = original_fivegrams, to = FALSE)
from_full <- nest_data(df = all_fivegrams, to = FALSE)
to_full <- nest_data(df = all_fivegrams, to = TRUE)
to_borrowed <- nest_data(df = borrowed_fivegrams, to = TRUE)

# SEVEN-GRAMS
from_new7 <- nest_data(df = original_sevengrams, to = FALSE)
from_full7 <- nest_data(df = all_sevengrams, to = FALSE)
to_full7 <- nest_data(df = all_sevengrams, to = TRUE)
to_borrowed7 <- nest_data(df = borrowed_sevengrams, to = TRUE)

# FIVE-GRAMS FROM TEXT THAT WAS NOT STEMMED AND STILL CONTAINS STOP WORDS
from_new_nostop <- nest_data(df = original_fivegrams_nostop, to = FALSE)
from_full_nostop <- nest_data(df = all_fivegrams_nostop, to = FALSE)
to_full_nostop <- nest_data(df = all_fivegrams_nostop, to = TRUE)
to_borrowed_nostop <- nest_data(df = borrowed_fivegrams_nostop, to = TRUE)


# 3.  CALCULATE SIMILARITIES (FIVE-GRAMS) -----------

# SOURCE: ORIGINAL (INNOVATIVE) FIVE-GRAMS -> FOCAL: ALL FIVE-GRAMS
new_to_full_df <- get_similarity(from_df = from_new, to_df = to_full)
summary(new_to_full_df[["ratio"]])

# SOURCE: ALL FIVE-GRAMS -> FOCAL: ALL FIVE-GRAMS
full_to_full_df <- get_similarity(from_df = from_full, to_df = to_full)
summary(full_to_full_df[["ratio"]])

# SOURCE: ALL FIVE-GRAMS -> FOCAL: BORROWED FIVE-GRAMS
full_to_borrowed_df <- get_similarity(from_df = from_full, to_df = to_borrowed)
summary(full_to_borrowed_df[["ratio"]])

# SOURCE: ORIGINAL (INNOVATIVE) FIVE-GRAMS -> FOCAL: BORROWED FIVE-GRAMS
new_to_borrowed_df <- get_similarity(from_df = from_new, to_df = to_borrowed)
summary(new_to_borrowed_df[["ratio"]])



# 4. CALCULATE SIMILARITIES (THREE-GRAMS) -----------

# SOURCE: ORIGINAL (INNOVATIVE) THREE-GRAMS -> FOCAL: ALL THREE-GRAMS
new_to_full_df3 <- get_similarity(from_df = from_new3, to_df = to_full3)
# SOURCE: ALL THREE-GRAMS -> FOCAL: ALL THREE-GRAMS
full_to_full_df3 <- get_similarity(from_df = from_full3, to_df = to_full3)
# SOURCE: ALL THREE-GRAMS -> FOCAL: BORROWED THREE-GRAMS
full_to_borrowed_df3 <- get_similarity(from_df = from_full3, to_df = to_borrowed3)
# SOURCE: ORIGINAL (INNOVATIVE) THREE-GRAMS -> FOCAL: BORROWED THREE-GRAMS
new_to_borrowed_df3 <- get_similarity(from_df = from_new3, to_df = to_borrowed3)


#  5. CALCULATE SIMILARITIES (SEVEN-GRAMS) -----------

# SOURCE: ORIGINAL (INNOVATIVE) SEVEN-GRAMS -> FOCAL: ALL SEVEN-GRAMS
new_to_full_df7 <- get_similarity(from_df = from_new7, to_df = to_full7)
# SOURCE: ALL SEVEN-GRAMS -> FOCAL: ALL SEVEN-GRAMS
full_to_full_df7 <- get_similarity(from_df = from_full7, to_df = to_full7)
# SOURCE: ALL SEVEN-GRAMS -> FOCAL: BORROWED SEVEN-GRAMS
full_to_borrowed_df7 <- get_similarity(from_df = from_full7, to_df = to_borrowed7)
# SOURCE: ORIGINAL (INNOVATIVE) SEVEN-GRAMS -> FOCAL: BORROWED SEVEN-GRAMS
new_to_borrowed_df7 <- get_similarity(from_df = from_new7, to_df = to_borrowed7)





# 6. CALCULATE SIMILARITIES (FIVE-GRAMS, WITHOUT STEMMING OR REMOVING STOP WORDS) -----------

# SOURCE: ORIGINAL (INNOVATIVE) N-GRAMS -> FOCAL: ALL N-GRAMS
new_to_full_df_nostop <- get_similarity(
  from_df = from_new_nostop,
  to_df = to_full_nostop
)
summary(new_to_full_df_nostop[["ratio"]])

# SOURCE: ALL N-GRAMS -> FOCAL: ALL N-GRAMS
full_to_full_df_nostop <- get_similarity(
  from_df = from_full_nostop,
  to_df = to_full_nostop
)
summary(full_to_full_df_nostop[["ratio"]])

# SOURCE: ALL N-GRAMS -> FOCAL: BORROWED N-GRAMS
full_to_borrowed_df_nostop <- get_similarity(
  from_df = from_full_nostop,
  to_df = to_borrowed_nostop
)
summary(full_to_borrowed_df_nostop[["ratio"]])

# SOURCE: ORIGINAL (INNOVATIVE) N-GRAMS -> FOCAL: BORROWED N-GRAMS
new_to_borrowed_df_nostop <- get_similarity(
  from_df = from_new_nostop,
  to_df = to_borrowed_nostop
)
summary(new_to_borrowed_df_nostop[["ratio"]])




# 7. SAVE  ------------------


# EACH CALL WRITES ONE OBJECT TO ITS OWN FILE, SO THE ORDER IS IMMATERIAL;
# THE GROUPS FOLLOW THE ORDER OF SECTIONS 3-6 ABOVE

# FIVE-GRAM RESULTS
save_rds(the_name = "new_to_full_df")
save_rds(the_name = "full_to_full_df")
save_rds(the_name = "full_to_borrowed_df")
save_rds(the_name = "new_to_borrowed_df")

# THREE-GRAM RESULTS
save_rds(the_name = "new_to_full_df3")
save_rds(the_name = "full_to_full_df3")
save_rds(the_name = "full_to_borrowed_df3")
save_rds(the_name = "new_to_borrowed_df3")

# SEVEN-GRAM RESULTS
save_rds(the_name = "new_to_full_df7")
save_rds(the_name = "full_to_full_df7")
save_rds(the_name = "full_to_borrowed_df7")
save_rds(the_name = "new_to_borrowed_df7")

# FIVE-GRAM RESULTS FROM TEXT THAT WAS NOT STEMMED AND STILL HAS STOP WORDS
save_rds(the_name = "new_to_full_df_nostop")
save_rds(the_name = "full_to_full_df_nostop")
save_rds(the_name = "full_to_borrowed_df_nostop")
save_rds(the_name = "new_to_borrowed_df_nostop")


# DISPLAY VERSION NUMBERS FOR R & PACKAGES IN USE
sessionInfo()

# PLAY A COMPLETION SOUND ONLY WHEN THE OPTIONAL 'beepr' PACKAGE IS INSTALLED,
# SO A MISSING PACKAGE CANNOT MAKE THE SCRIPT END WITH AN ERROR
if (requireNamespace("beepr", quietly = TRUE)) {
  beepr::beep(sound = 8)
}
